/*
 * Copyright 1998-2014 University Corporation for Atmospheric Research/Unidata
 *
 * Portions of this software were developed by the Unidata Program at the
 * University Corporation for Atmospheric Research.
 *
 * Access and use of this software shall impose the following obligations
 * and understandings on the user. The user is granted the right, without
 * any fee or cost, to use, copy, modify, alter, enhance and distribute
 * this software, and any derivative works thereof, and its supporting
 * documentation for any purpose whatsoever, provided that this entire
 * notice appears in all copies of the software, derivative works and
 * supporting documentation. Further, UCAR requests that the user credit
 * UCAR/Unidata in any publications that result from the use of this
 * software or in any product that includes this software. The names UCAR
 * and/or Unidata, however, may not be used in any advertising or publicity
 * to endorse or promote any products or commercial entity unless specific
 * written permission is obtained from UCAR/Unidata. The user also
 * understands that UCAR/Unidata is not obligated to provide the user with
 * any support, consulting, training or assistance of any kind with regard
 * to the use, operation and performance of this software nor to provide
 * the user with any updates, revisions, new versions or "bug fixes."
 *
 * THIS SOFTWARE IS PROVIDED BY UCAR/UNIDATA "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL UCAR/UNIDATA BE LIABLE FOR ANY SPECIAL,
 * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
 * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
 * WITH THE ACCESS, USE OR PERFORMANCE OF THIS SOFTWARE.
 */
package ucar.nc2.grib.grib2.table;

import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import ucar.nc2.constants.CDM;
import ucar.unidata.util.StringUtil2;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Formatter;
import java.util.List;

/**
 * Read NCEP html files to extract the GRIB-2 tables.
 * <p>
 * Starting from the top-level NCEP GRIB-2 documentation page, every link whose text starts with
 * "Table 4" is followed. "Table 4.2" is an index of parameter tables, each of which is written as a
 * {@code parameterMap} XML file; every other "Table 4..." link is written as a {@code codeTable}
 * XML file. Output files are created in the directory given to {@link #setDirOut(String)}.
 *
 * @author caron
 * @since 1/7/12
 */
public class NcepHtmlScraper {
  /** Entry page of the NCEP GRIB-2 documentation. */
  static private final String TOP_DOC_URL = "http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_doc.shtml";

  /** Timeout passed to jsoup for every page fetch: 5 seconds. */
  static private final int TIMEOUT_MSECS = 5 * 1000;

  /** Output directory used by {@link #main(String[])} when no argument is given. */
  static private final String DEFAULT_DIR_OUT = "/Users/sarms/Desktop/ncep/grib2/";

  /** Directory the generated files are written into. */
  String dirOut;

  static private final boolean debugParam = false;
  static private final boolean debug = false;
  static private final boolean show = false;

  /**
   * Set the directory that generated files are written into.
   *
   * @param dirOut output directory; a trailing path separator is optional
   */
  public void setDirOut(String dirOut) {
    this.dirOut = dirOut;
  }

  //////////////////////////////////////////////////////////////////
  // http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_doc.shtml

  /**
   * Fetch the top-level document and process each "Table 4..." link found in it.
   *
   * @throws IOException if a page cannot be fetched or an output file cannot be written
   */
  void parseTopDoc() throws IOException {
    Document doc = fetch(TOP_DOC_URL);

    for (Element link : doc.select("a[href]")) {
      // The text node following the anchor is used as the table title, with hyphens removed.
      Node sib = link.nextSibling();
      String title = (sib == null) ? null : StringUtil2.remove(sib.toString(), "-").trim();

      String linkText = link.text();
      if (linkText.equals("Table 4.2")) {
        parseTable42(link.attr("abs:href"), linkText, title);
      } else if (linkText.startsWith("Table 4")) {
        parseCodeTable(link.attr("abs:href"), linkText, title);
      }
    }
  }

  /**
   * Read the first HTML table on a page as a code table and write it as XML.
   * Rows with fewer than two cells, rows mentioning "Reserved", and rows whose first cell is not an
   * integer are skipped. A page with no table at all is reported and skipped.
   *
   * @param url       absolute URL of the page holding the table
   * @param tableName link text, e.g. "Table 4.1"; with whitespace removed it becomes the file name
   * @param title     table title, or null to use a generic one
   * @throws IOException if the page cannot be fetched or the output file cannot be written
   */
  void parseCodeTable(String url, String tableName, String title) throws IOException {
    System.out.printf("parseCodeTable url=%s tableName=%s title=%s%n", url, tableName, title);
    Document doc = fetch(url);

    if (title == null) title = "NCEP GRIB-2 Code Table";

    Element table = doc.select("table").first();
    if (table == null) { // first() yields null when the selection is empty
      System.out.printf("*** No table found at %s%n", url);
      return;
    }

    List<Code> stuff = new ArrayList<>();
    for (Element row : table.select("tr")) {
      Elements cols = row.select("td");
      if (debug) showColumns(cols);
      if (cols.size() < 2) continue;

      String snum = cellText(cols, 0);
      String desc = cellText(cols, 1);
      if (snum.contains("Reserved") || desc.contains("Reserved")) {
        if (debug) System.out.printf("*** Skip Reserved %s%n", row.text());
        continue;
      }

      try {
        int pnum = Integer.parseInt(snum);
        if (debug) System.out.printf("val %d == %s%n", pnum, desc);
        stuff.add(new Code(pnum, desc));
      } catch (NumberFormatException e) {
        System.out.printf("*** Cant parse %s == %s%n", snum, row.text());
      }
    }

    String filename = StringUtil2.removeWhitespace(tableName);
    writeCodeTableXml(filename, title, url, tableName, stuff);
  }

  /** One entry of a code table: numeric code and its description. */
  private static class Code {
    final int no;
    final String desc;

    private Code(int no, String desc) {
      this.no = no;
      this.desc = desc;
    }
  }

  /**
   * Write a code table as a {@code codeTable} XML document to {@code <dirOut>/<filename>.xml}.
   *
   * @param filename  output file name without extension
   * @param title     content of the {@code title} element
   * @param source    content of the {@code source} element (the URL the table was read from)
   * @param tableName content of the {@code table} element
   * @param stuff     the table entries
   * @throws IOException if the file cannot be written
   */
  private void writeCodeTableXml(String filename, String title, String source, String tableName, List<Code> stuff) throws IOException {
    org.jdom2.Element rootElem = makeRoot("codeTable", tableName, title, source);
    org.jdom2.Document doc = new org.jdom2.Document(rootElem);

    for (Code p : stuff) {
      org.jdom2.Element paramElem = new org.jdom2.Element("parameter");
      paramElem.setAttribute("code", Integer.toString(p.no));
      paramElem.addContent(new org.jdom2.Element("description").setText(p.desc));
      rootElem.addContent(paramElem);
    }

    writeXml(doc, filename);
  }

  ///////////////////////////////////////////////////////////////////////

  /**
   * Fetch the Table 4.2 index page and hand every link on it to
   * {@link #parseParamTable(String, String)}, which ignores links that are not parameter tables.
   *
   * @param url       absolute URL of the Table 4.2 index page
   * @param tableName link text that led here (only logged)
   * @param title     title found next to the link (only logged)
   * @throws IOException if a page cannot be fetched or an output file cannot be written
   */
  void parseTable42(String url, String tableName, String title) throws IOException {
    System.out.printf("parseTable42 url=%s tableName=%s title=%s%n", url, tableName, title);
    Document doc = fetch(url);

    for (Element link : doc.select("a[href]")) {
      parseParamTable(link.attr("abs:href"), link.text());
    }
  }

  /**
   * Read the first HTML table on a page as a parameter table and write it as XML.
   * Only URLs containing "grib2_table4" are processed. Rows must have 3 cells (number, description,
   * units) or 4 cells (plus abbreviation); rows mentioning "Reserved" or "Missing", and rows whose
   * first cell is not an integer, are skipped. A page with no table at all is reported and skipped.
   *
   * @param url   absolute URL of the page, e.g. one ending in {@code grib2_table4-2-0-0.shtml}
   * @param title table title, or null to use a generic one
   * @throws IOException if the page cannot be fetched or the output file cannot be written
   */
  void parseParamTable(String url, String title) throws IOException {
    String match = "grib2_table4";
    if (!url.contains(match)) return;

    System.out.printf("parseParamTable url=%s title=%s%n", url, title);
    Document doc = fetch(url);

    if (title == null) title = "NCEP GRIB-2 Param Table";

    Element table = doc.select("table").first();
    if (table == null) { // explicit check: an assert is not evaluated unless the JVM runs with -ea
      System.out.printf("*** No table found at %s%n", url);
      return;
    }

    List<Param> stuff = new ArrayList<>();
    for (Element row : table.select("tr")) {
      Elements cols = row.select("td");
      if (debugParam) showColumns(cols);

      int ncols = cols.size();
      if (ncols != 3 && ncols != 4) continue;

      String snum = cellText(cols, 0);
      String desc = cellText(cols, 1);
      if (snum.contains("Reserved") || desc.contains("Reserved") || desc.contains("Missing")) {
        if (debugParam) System.out.printf("*** Skip Reserved %s%n", row.text());
        continue;
      }

      try {
        int pnum = Integer.parseInt(snum);
        String units = cols.get(2).text();
        String abbrev = (ncols == 4) ? cols.get(3).text() : null; // 3-column tables have no abbreviation
        if (debugParam) {
          if (abbrev != null) System.out.printf("val %d == %s %s %s%n", pnum, desc, units, abbrev);
          else System.out.printf("val %d == %s %s%n", pnum, desc, units);
        }
        stuff.add(new Param(pnum, desc, units, abbrev));
      } catch (NumberFormatException e) {
        System.out.printf("*** Cant parse %s == %s%n", snum, row.text());
      }
    }

    // grib2_table4-2-0-0.shtml -> Table4.2.0.0
    int pos = url.indexOf(match) + match.length();
    int lastPos = url.lastIndexOf('.');
    if (lastPos < pos) lastPos = url.length(); // no extension after the table name
    String filename = "Table4" + url.substring(pos, lastPos);
    filename = StringUtil2.removeWhitespace(filename);
    filename = StringUtil2.substitute(filename, "-", ".");
    writeParamTableXml(filename, title, url, filename, stuff);
  }

  /** One entry of a parameter table; {@code name} is null when the table has no abbreviation column. */
  private static class Param {
    final int pnum;
    final String desc, unit, name;

    private Param(int pnum, String desc, String unit, String name) {
      this.pnum = pnum;
      this.desc = desc;
      this.unit = (unit == null) ? null : StringUtil2.cleanup(unit);
      this.name = (name == null) ? null : StringUtil2.cleanup(name);
    }
  }

  /**
   * Write a parameter table as a {@code parameterMap} XML document to {@code <dirOut>/<filename>.xml}.
   *
   * @param filename  output file name without extension
   * @param title     content of the {@code title} element
   * @param source    content of the {@code source} element (the URL the table was read from)
   * @param tableName content of the {@code table} element
   * @param stuff     the table entries
   * @throws IOException if the file cannot be written
   */
  private void writeParamTableXml(String filename, String title, String source, String tableName, List<Param> stuff) throws IOException {
    org.jdom2.Element rootElem = makeRoot("parameterMap", tableName, title, source);
    org.jdom2.Document doc = new org.jdom2.Document(rootElem);

    for (Param p : stuff) {
      org.jdom2.Element paramElem = new org.jdom2.Element("parameter");
      paramElem.setAttribute("code", Integer.toString(p.pnum));
      paramElem.addContent(new org.jdom2.Element("shortName").setText(p.name));
      paramElem.addContent(new org.jdom2.Element("description").setText(p.desc));
      paramElem.addContent(new org.jdom2.Element("units").setText(p.unit));
      rootElem.addContent(paramElem);
    }

    writeXml(doc, filename);
  }

  /**
   * Write a parameter table as text, one {@code num:name:description [unit]} line per parameter,
   * preceded by two {@code #} comment lines. Not called from within this class.
   *
   * @param name     written on the first comment line
   * @param source   written on the second comment line
   * @param filename output file name, relative to the output directory
   * @param params   the table entries
   * @throws IOException if the file cannot be written
   */
  private void writeTable2Wgrib(String name, String source, String filename, List<Param> params) throws IOException {
    Formatter f = new Formatter();
    f.format("# %s%n", name);
    f.format("# %s%n", source);
    for (Param p : params)
      f.format("%3d:%s:%s [%s]%n", p.pnum, p.name, p.desc, p.unit); // 1:PRES:Pressure [Pa]

    writeText(new File(dirOut, filename), f.toString());

    if (show) System.out.printf("%s%n", f);
  }

  ///////////////////////////////////////////////////////////////////////
  // shared helpers

  /** Fetch and parse an HTML page using the common timeout. */
  private static Document fetch(String url) throws IOException {
    return Jsoup.parse(new URL(url), TIMEOUT_MSECS);
  }

  /** Cleaned-up, trimmed text of the cell at the given index. */
  private static String cellText(Elements cols, int idx) {
    return StringUtil2.cleanup(cols.get(idx).text()).trim();
  }

  /** Debug output: number of cells in a row followed by each cell's text. */
  private static void showColumns(Elements cols) {
    System.out.printf(" #cols=%d: ", cols.size());
    for (Element col : cols) System.out.printf("%s:", col.text());
    System.out.printf("%n");
  }

  /** Create a root element holding the table, title and source children common to both XML formats. */
  private static org.jdom2.Element makeRoot(String rootName, String tableName, String title, String source) {
    org.jdom2.Element rootElem = new org.jdom2.Element(rootName);
    rootElem.addContent(new org.jdom2.Element("table").setText(tableName));
    rootElem.addContent(new org.jdom2.Element("title").setText(title));
    rootElem.addContent(new org.jdom2.Element("source").setText(source));
    return rootElem;
  }

  /** Pretty-print the document to {@code <dirOut>/<filename>.xml}, echoing it when {@code show} is set. */
  private void writeXml(org.jdom2.Document doc, String filename) throws IOException {
    XMLOutputter fmt = new XMLOutputter(Format.getPrettyFormat());
    String x = fmt.outputString(doc);

    writeText(new File(dirOut, filename + ".xml"), x);

    if (show) System.out.printf("%s%n", x);
  }

  /** Write text to a file as UTF-8, replacing any existing content. */
  private static void writeText(File file, String text) throws IOException {
    try (FileOutputStream fout = new FileOutputStream(file)) {
      fout.write(text.getBytes(CDM.utf8Charset));
    }
  }

  // C:\dev\github\thredds\grib\src\main\resources\resources\grib2\ncep

  /**
   * Scrape all tables into an output directory.
   *
   * @param args optional: {@code args[0]} is the output directory; a built-in default is used otherwise
   * @throws IOException if a page cannot be fetched or an output file cannot be written
   */
  public static void main(String[] args) throws IOException {
    // set temp dir for new grib2 table info
    String dirOut = (args.length > 0) ? args[0] : DEFAULT_DIR_OUT;

    NcepHtmlScraper scraper = new NcepHtmlScraper();
    scraper.setDirOut(dirOut);

    // mkdirs() also returns false when the directory already exists, so test for that first
    File dir = new File(dirOut);
    if (!dir.isDirectory() && !dir.mkdirs())
      System.out.printf("mkdir %s failed %n", dir.getPath());

    scraper.parseTopDoc();
  }
}